Creating new columns converted from the measure per 100 g to the measure per serving.



In [2]:

    
import pandas as pd



In [3]:

    
#Load dataset
data = pd.read_csv("demo_food_data.csv", sep = ",")



In [4]:

    
for c in data.columns:
    print(c)









    



Unnamed: 0
code
creator
product_name
generic_name
quantity
brands
brands_tags
categories
categories_tags
serving_size
energy_100g
fat_100g
cholesterol_100g
carbohydrates_100g
sugars_100g
starch_100g
fiber_100g
proteins_100g
salt_100g
sodium_100g
alcohol_100g
folates_100g
bicarbonate_100g
potassium_100g
chloride_100g
calcium_100g
iron_100g
fluoride_100g
iodine_100g
caffeine_100g
cocoa_100g
demo_cat



In [14]:

    
# Take a quick look
data.head()









    Out[14]:






  
    
      
      Unnamed: 0
      code
      creator
      product_name
      generic_name
      quantity
      brands
      brands_tags
      categories
      categories_tags
      ...
      bicarbonate_100g
      potassium_100g
      chloride_100g
      calcium_100g
      iron_100g
      fluoride_100g
      iodine_100g
      caffeine_100g
      cocoa_100g
      demo_cat
    
  
  
    
      0
      1816
      11110008817
      usda-ndb-import
      Maple Glazed Cake Donuts
      NaN
      NaN
      Kroger,  The Kroger Co.
      kroger,the-kroger-co
      NaN
      NaN
      ...
      NaN
      NaN
      NaN
      0.113
      0.00203
      NaN
      NaN
      NaN
      NaN
      donut
    
    
      1
      2271
      11110198990
      usda-ndb-import
      Cake Donuts, S'Mores
      NaN
      NaN
      Bakery Fresh Goodness,  The Kroger Co.
      bakery-fresh-goodness,the-kroger-co
      NaN
      NaN
      ...
      NaN
      NaN
      NaN
      0.065
      0.00232
      NaN
      NaN
      NaN
      NaN
      donut
    
    
      2
      5599
      11150146098
      usda-ndb-import
      Kids Graham Crackers, Chocolate
      NaN
      NaN
      Roundy's
      roundy-s
      NaN
      NaN
      ...
      NaN
      NaN
      NaN
      0.000
      0.00360
      NaN
      NaN
      NaN
      NaN
      cracker
    
    
      3
      6793
      11153041949
      usda-ndb-import
      Crunchy Oat Squares With Cinnamon Cereal
      NaN
      NaN
      Foodtown,  Foodtown  Inc.
      foodtown,foodtown-inc
      NaN
      NaN
      ...
      NaN
      0.438
      NaN
      0.312
      0.02250
      NaN
      NaN
      NaN
      NaN
      cereal
    
    
      4
      9226
      11225127700
      usda-ndb-import
      Sweet P's Bake Shop, Jumbo Glazed Raised Donuts
      NaN
      NaN
      Kingston Marketing Co
      kingston-marketing-co
      NaN
      NaN
      ...
      NaN
      NaN
      NaN
      0.103
      0.00138
      NaN
      NaN
      NaN
      NaN
      donut
    
  

5 rows × 33 columns

Now going to go through and find items that have certain category words in the product name. Then filter these to exclude the most often word that is confused in there (e.g. donut flavor coffee gets picked up under donut).

Then going to sort each of these based on the rank of items on key factors like sugar. And for each factor, going to pick items that are at specified percentiles, so we get a wide range on those factors.



In [5]:

    
#Define the columns we want to convert
original_names = ['energy_100g', 'fat_100g', 'cholesterol_100g','carbohydrates_100g','sugars_100g','starch_100g','fiber_100g',
              'proteins_100g','salt_100g','sodium_100g','alcohol_100g','folates_100g','bicarbonate_100g',
'potassium_100g','chloride_100g','calcium_100g','iron_100g','fluoride_100g','iodine_100g','caffeine_100g','cocoa_100g']

#Create new variable names
new_names=[]
for i in range(len(original_names)):
    new_names.append(original_names[i].split("_")[0])
    
#Parse the serving size and calculate the factor to convert 100 g to serving g
f = lambda x: (x["serving_size"].split(" g")[0])
data["serv_size"] = data.apply(f, axis=1)

f = lambda x: (float(x["serv_size"])/100)
data["serv_factor"] = data.apply(f, axis=1)


#For each target column, create a new column and calculate measure per serving
f = lambda x: (float(x["energy_100g"])*float(x["serv_factor"]))
data["energy"] = data.apply(f, axis=1)

f = lambda x: (float(x["fat_100g"])*float(x["serv_factor"]))
data["fat"] = data.apply(f, axis=1)

f = lambda x: (float(x["cholesterol_100g"])*float(x["serv_factor"]))
data["cholesterol"] = data.apply(f, axis=1)

f = lambda x: (float(x["carbohydrates_100g"])*float(x["serv_factor"]))
data["carbohydrates"] = data.apply(f, axis=1)

f = lambda x: (float(x["sugars_100g"])*float(x["serv_factor"]))
data["sugars"] = data.apply(f, axis=1)

f = lambda x: (float(x["starch_100g"])*float(x["serv_factor"]))
data["starch"] = data.apply(f, axis=1)

f = lambda x: (float(x["fiber_100g"])*float(x["serv_factor"]))
data["fiber"] = data.apply(f, axis=1)

f = lambda x: (float(x["proteins_100g"])*float(x["serv_factor"]))
data["proteins"] = data.apply(f, axis=1)

f = lambda x: (float(x["salt_100g"])*float(x["serv_factor"]))
data["salt"] = data.apply(f, axis=1)

f = lambda x: (float(x["sodium_100g"])*float(x["serv_factor"]))
data["sodium"] = data.apply(f, axis=1)

f = lambda x: (float(x["alcohol_100g"])*float(x["serv_factor"]))
data["alcohol"] = data.apply(f, axis=1)

f = lambda x: (float(x["folates_100g"])*float(x["serv_factor"]))
data["folates"] = data.apply(f, axis=1)

f = lambda x: (float(x["bicarbonate_100g"])*float(x["serv_factor"]))
data["bicarbonate"] = data.apply(f, axis=1)

f = lambda x: (float(x["potassium_100g"])*float(x["serv_factor"]))
data["potassium"] = data.apply(f, axis=1)

f = lambda x: (float(x["chloride_100g"])*float(x["serv_factor"]))
data["chloride"] = data.apply(f, axis=1)

f = lambda x: (float(x["calcium_100g"])*float(x["serv_factor"]))
data["calcium"] = data.apply(f, axis=1)

f = lambda x: (float(x["iron_100g"])*float(x["serv_factor"]))
data["iron"] = data.apply(f, axis=1)

f = lambda x: (float(x["fluoride_100g"])*float(x["serv_factor"]))
data["fluoride"] = data.apply(f, axis=1)

f = lambda x: (float(x["iodine_100g"])*float(x["serv_factor"]))
data["iodine"] = data.apply(f, axis=1)

f = lambda x: (float(x["caffeine_100g"])*float(x["serv_factor"]))
data["caffeine"] = data.apply(f, axis=1)

f = lambda x: (float(x["cocoa_100g"])*float(x["serv_factor"]))
data["cocoa"] = data.apply(f, axis=1)



In [6]:

    
# Take a look at what we built
data.head()









    Out[6]:






  
    
      
      Unnamed: 0
      code
      creator
      product_name
      generic_name
      quantity
      brands
      brands_tags
      categories
      categories_tags
      ...
      folates
      bicarbonate
      potassium
      chloride
      calcium
      iron
      fluoride
      iodine
      caffeine
      cocoa
    
  
  
    
      0
      1816
      11110008817
      usda-ndb-import
      Maple Glazed Cake Donuts
      NaN
      NaN
      Kroger,  The Kroger Co.
      kroger,the-kroger-co
      NaN
      NaN
      ...
      NaN
      NaN
      NaN
      NaN
      0.08023
      0.001441
      NaN
      NaN
      NaN
      NaN
    
    
      1
      2271
      11110198990
      usda-ndb-import
      Cake Donuts, S'Mores
      NaN
      NaN
      Bakery Fresh Goodness,  The Kroger Co.
      bakery-fresh-goodness,the-kroger-co
      NaN
      NaN
      ...
      NaN
      NaN
      NaN
      NaN
      0.04030
      0.001438
      NaN
      NaN
      NaN
      NaN
    
    
      2
      5599
      11150146098
      usda-ndb-import
      Kids Graham Crackers, Chocolate
      NaN
      NaN
      Roundy's
      roundy-s
      NaN
      NaN
      ...
      NaN
      NaN
      NaN
      NaN
      0.00000
      0.001080
      NaN
      NaN
      NaN
      NaN
    
    
      3
      6793
      11153041949
      usda-ndb-import
      Crunchy Oat Squares With Cinnamon Cereal
      NaN
      NaN
      Foodtown,  Foodtown  Inc.
      foodtown,foodtown-inc
      NaN
      NaN
      ...
      NaN
      NaN
      0.14016
      NaN
      0.09984
      0.007200
      NaN
      NaN
      NaN
      NaN
    
    
      4
      9226
      11225127700
      usda-ndb-import
      Sweet P's Bake Shop, Jumbo Glazed Raised Donuts
      NaN
      NaN
      Kingston Marketing Co
      kingston-marketing-co
      NaN
      NaN
      ...
      NaN
      NaN
      NaN
      NaN
      0.08034
      0.001076
      NaN
      NaN
      NaN
      NaN
    
  

5 rows × 56 columns



In [7]:

    
# Now write it out to disk
outfile = "demo_food_data_sd_regularized_columns.csv"
data.to_csv(outfile)

	Unnamed: 0	code	creator	product_name	generic_name	quantity	brands	brands_tags	categories	categories_tags	...	bicarbonate_100g	potassium_100g	chloride_100g	calcium_100g	iron_100g	fluoride_100g	iodine_100g	caffeine_100g	cocoa_100g	demo_cat
0	1816	11110008817	usda-ndb-import	Maple Glazed Cake Donuts	NaN	NaN	Kroger, The Kroger Co.	kroger,the-kroger-co	NaN	NaN	...	NaN	NaN	NaN	0.113	0.00203	NaN	NaN	NaN	NaN	donut
1	2271	11110198990	usda-ndb-import	Cake Donuts, S'Mores	NaN	NaN	Bakery Fresh Goodness, The Kroger Co.	bakery-fresh-goodness,the-kroger-co	NaN	NaN	...	NaN	NaN	NaN	0.065	0.00232	NaN	NaN	NaN	NaN	donut
2	5599	11150146098	usda-ndb-import	Kids Graham Crackers, Chocolate	NaN	NaN	Roundy's	roundy-s	NaN	NaN	...	NaN	NaN	NaN	0.000	0.00360	NaN	NaN	NaN	NaN	cracker
3	6793	11153041949	usda-ndb-import	Crunchy Oat Squares With Cinnamon Cereal	NaN	NaN	Foodtown, Foodtown Inc.	foodtown,foodtown-inc	NaN	NaN	...	NaN	0.438	NaN	0.312	0.02250	NaN	NaN	NaN	NaN	cereal
4	9226	11225127700	usda-ndb-import	Sweet P's Bake Shop, Jumbo Glazed Raised Donuts	NaN	NaN	Kingston Marketing Co	kingston-marketing-co	NaN	NaN	...	NaN	NaN	NaN	0.103	0.00138	NaN	NaN	NaN	NaN	donut